Support <textarea> tags in Balancer.

author C. Scott Ananian <cscott@cscott.net>

Fri, 15 Jul 2016 22:46:14 +0000 (18:46 -0400)

committer Tim Starling <tstarling@wikimedia.org>

Thu, 21 Jul 2016 03:37:10 +0000 (03:37 +0000)
author C. Scott Ananian <cscott@cscott.net>
Fri, 15 Jul 2016 22:46:14 +0000 (18:46 -0400)
committer Tim Starling <tstarling@wikimedia.org>
Thu, 21 Jul 2016 03:37:10 +0000 (03:37 +0000)
diff --git a/includes/tidy/Balancer.php b/includes/tidy/Balancer.php

index 37807ba..b2d6ba1 100644 (file)
--- a/includes/tidy/Balancer.php
+++ b/includes/tidy/Balancer.php
@@ -75,7 +75,7 @@ class BalanceSets {
                 self::HTML_NAMESPACE => [
                         'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
                         'frame' => true,
-                       'plaintext' => true, 'isindex' => true, 'textarea' => true,
+                       'plaintext' => true, 'isindex' => true,
                         'xmp' => true, 'iframe' => true, 'noembed' => true,
                         'noscript' => true, 'script' => true,
                         'title' => true
@@ -92,6 +92,12 @@ class BalanceSets {
                 ]
         ];
  
+       public static $extraLinefeedSet = [
+               self::HTML_NAMESPACE => [
+                       'pre' => true, 'textarea' => true, 'listing' => true,
+               ]
+       ];
+
         public static $headingSet = [
                 self::HTML_NAMESPACE => [
                         'h1' => true, 'h2' => true, 'h3' => true,
@@ -513,11 +519,21 @@ class BalanceElement {
                 }
                 if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
                         $out = "<{$this->localName}{$encAttribs}>";
+                       $len = strlen( $out );
                         // flatten children
                         foreach ( $this->children as $elt ) {
                                 $out .= "{$elt}";
                         }
                         $out .= "</{$this->localName}>";
+                       if (
+                               $this->isA( BalanceSets::$extraLinefeedSet ) &&
+                               $out[$len] === "\n"
+                       ) {
+                               // Double a linefeed directly after the pre/listing/textarea start
+                               // tag, according to the HTML5 fragment serialization algorithm.
+                               $out = substr( $out, 0, $len + 1 ) .
+                                       substr( $out, $len );
+                       }
                 } else {
                         $out = "<{$this->localName}{$encAttribs} />";
                         Assert::invariant(
@@ -1740,18 +1756,19 @@ class BalanceActiveFormattingElements {
   * - The document is never in "quirks mode".
   * - All occurrences of < and > have been entity escaped, so we
   *   can parse tags by simply splitting on those two characters.
+ *   (This also simplifies the handling of < inside <textarea>.)
   *   The character < must not appear inside comments.
   *   Similarly, all attributes have been "cleaned" and are double-quoted
   *   and escaped.
   * - All null characters are assumed to have been removed.
- * - We don't alter linefeeds after <pre>/<listing>.
   * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
- *   <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
+ *   <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
   *   <noembed>, <noscript>, <script>, <title>.  As a result,
   *   further simplifications can be made:
   *   - `frameset-ok` is not tracked.
   *   - `head element pointer` is not tracked (but presumed non-null)
- *   - Tokenizer has only a single mode.
+ *   - Tokenizer has only a single mode. (<textarea> wants RCDATA and
+ *     <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
   *
   *   We generally mark places where we omit cases from the spec due to
   *   disallowed elements with a comment: `# OMITTED: <element-name>`.
@@ -1775,11 +1792,14 @@ class Balancer {
         private $tidyCompat;
         private $allowComments;
  
-       private $textIntegrationMode = false;
+       private $textIntegrationMode;
         private $pendingTableText;
         private $originalInsertionMode;
         private $fragmentContext;
         private $formElementPointer;
+       private $ignoreLinefeed;
+       private $inRCDATA;
+       private $inRAWTEXT;
  
         /**
          * Valid HTML5 comments.
@@ -1890,6 +1910,11 @@ class Balancer {
                 $this->processingCallback = $processingCallback;
                 $this->processingArgs = $processingArgs;
  
+               $this->textIntegrationMode =
+                       $this->ignoreLinefeed =
+                       $this->inRCDATA =
+                       $this->inRAWTEXT = false;
+
                 # The stack is constructed with an <html> element already on it.
                 # Set this up as a fragment parsed with <body> as the context.
                 $this->fragmentContext =
@@ -1942,6 +1967,19 @@ class Balancer {
                         # Don't actually inject the empty string as a text token.
                         return true;
                 }
+               // Support pre/listing/textarea by suppressing initial linefeed
+               if ( $this->ignoreLinefeed ) {
+                       $this->ignoreLinefeed = false;
+                       if ( $token === 'text' ) {
+                               if ( $value[0] === "\n" ) {
+                                       if ( $value === "\n" ) {
+                                               # Nothing would be left, don't inject the empty string.
+                                               return true;
+                                       }
+                                       $value = substr( $value, 1 );
+                               }
+                       }
+               }
                 // Some hoops we have to jump through
                 $adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
  
@@ -2095,6 +2133,7 @@ class Balancer {
                 # are stripped in the Sanitizer) but may be generated by extensions.
                 if (
                         $this->allowComments &&
+                       !( $this->inRCDATA || $this->inRAWTEXT ) &&
                         preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
                         /* verify EOF condition where necessary */
                         ( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
@@ -2129,6 +2168,22 @@ class Balancer {
                         $slash = $t = $attribStr = $brace = $rest = null;
                 }
                 $goodtag = $t;
+               if ( $this->inRCDATA ) {
+                       if ( $slash && $t === $this->inRCDATA ) {
+                               $this->inRCDATA = false;
+                       } else {
+                               // No tags allowed; this emulates the "rcdata" tokenizer mode.
+                               $goodtag = false;
+                       }
+               }
+               if ( $this->inRAWTEXT ) {
+                       if ( $slash && $t === $this->inRAWTEXT ) {
+                               $this->inRAWTEXT = false;
+                       } else {
+                               // No tags allowed, no entity-escaping done.
+                               $goodtag = false;
+                       }
+               }
                 $sanitize = $this->allowedHtmlElements !== null;
                 if ( $sanitize ) {
                         $goodtag = $t && isset( $this->allowedHtmlElements[$t] );
@@ -2155,6 +2210,8 @@ class Balancer {
                 if ( $goodtag ) {
                         $rest = str_replace( '>', '&gt;', $rest );
                         $this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
+               } elseif ( $this->inRAWTEXT ) {
+                       $this->insertToken( 'text', "<$x" );
                 } else {
                         # bad tag; serialize entire thing as text.
                         $this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
@@ -2260,7 +2317,7 @@ class Balancer {
  
         private function parseRawText( $value, $attribs = null ) {
                 $this->stack->insertHTMLElement( $value, $attribs );
-               // XXX switch tokenizer to rawtext state?
+               $this->inRAWTEXT = $value;
                 $this->originalInsertionMode = $this->switchMode( 'inTextMode' );
                 return true;
         }
@@ -2431,9 +2488,8 @@ class Balancer {
                                         $this->inBodyMode( 'endtag', 'p' );
                                 }
                                 $this->stack->insertHTMLElement( $value, $attribs );
-                               # As described in "simplifications" above:
-                               # 1. We don't touch the next token, even if it's a linefeed.
-                               # 2. OMITTED: frameset_ok
+                               $this->ignoreLinefeed = true;
+                               # OMITTED: frameset_ok
                                 return true;
  
                         case 'form':
@@ -2607,7 +2663,14 @@ class Balancer {
                                 return $this->inBodyMode( $token, 'img', $attribs, $selfclose );
  
                         # OMITTED: <isindex>
-                       # OMITTED: <textarea>
+
+                       case 'textarea':
+                               $this->stack->insertHTMLElement( $value, $attribs );
+                               $this->ignoreLinefeed = true;
+                               $this->inRCDATA = $value; // emulate rcdata tokenizer mode
+                               # OMITTED: frameset_ok
+                               return true;
+
                         # OMITTED: <xmp>
                         # OMITTED: <iframe>
                         # OMITTED: <noembed>
diff --git a/tests/phpunit/includes/tidy/BalancerTest.php b/tests/phpunit/includes/tidy/BalancerTest.php

index aa43ac7..f2e41bd 100644 (file)
--- a/tests/phpunit/includes/tidy/BalancerTest.php
+++ b/tests/phpunit/includes/tidy/BalancerTest.php
@@ -99,7 +99,6 @@ class BalancerTest extends MediaWikiTestCase {
                                         isset( $case['document']['props']['tags']['script'] ) ||
                                         isset( $case['document']['props']['tags']['svg script'] ) ||
                                         isset( $case['document']['props']['tags']['svg title'] ) ||
-                                       isset( $case['document']['props']['tags']['textarea'] ) ||
                                         isset( $case['document']['props']['tags']['title'] ) ||
                                         isset( $case['document']['props']['tags']['xmp'] )
                                 ) {
author	C. Scott Ananian <cscott@cscott.net>
	Fri, 15 Jul 2016 22:46:14 +0000 (18:46 -0400)
committer	Tim Starling <tstarling@wikimedia.org>
	Thu, 21 Jul 2016 03:37:10 +0000 (03:37 +0000)
includes/tidy/Balancer.php		patch | blob | history
tests/phpunit/includes/tidy/BalancerTest.php		patch | blob | history